Contest link: https://www.kaggle.com/c/nyc-taxi-trip-duration (NYC Taxi Trip Duration)
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline
pd.options.display.max_columns = 999
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split
from IPython.display import display, Image
import geopy.distance
import seaborn as sns
import datetime
import uszipcode
import xgboost
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeRegressor
from joblib import dump, load
from xgboost import plot_importance
import math
import holidays
import shap
import collections
# Load the raw NYC taxi training data and take a first look at its shape,
# dtypes, summary statistics and date coverage.
train = pd.read_csv("C:\\Users\\lllll\\Desktop\\Responsible ml\\nyc-taxi-trip-duration\\train\\train.csv")
train.head()
train.info()
train.describe()
# Date range covered by the pickup timestamps.
print(train['pickup_datetime'].min())
print(train['pickup_datetime'].max())
# trip_duration is recorded in seconds; convert the maximum to minutes for readability.
max_trip = train['trip_duration'].max()
print("Longest trip in minutes: ",max_trip/60)
The longest trip is 59k minutes which is not normal. Might be data error or driver(s) not taking trips while the trip sign is on.
Take a look at the target variable in detail. Only look at trips shorter than 10k seconds.
# Histogram of the target restricted to trips under 10,000 seconds.
plt.hist(train['trip_duration'], range = [0, 10000])
Most of the trips are less than 5k seconds. We should be careful when dealing with extreme long trips because those observations might have negative impact on the model.
# Count how many trips last 10,000 seconds or more.
extreme_data = train[train.trip_duration >= 1e+4]['trip_duration'].count()
print("Number of extreme data: ", extreme_data)
There are only 2,123 trips longer than 10k seconds compared to roughly 1.5 million rows in total; we should inspect those rows and decide whether to keep or delete them.
# Distinct duration values among the >=10k-second trips, plotted to spot the worst offenders.
unique_extreme = train[train.trip_duration >= 1e+4]['trip_duration'].unique()
print("Number of unique extreme data: ", len(unique_extreme))
plt.scatter([i for i in range(len(unique_extreme))], unique_extreme)
We see that there are only 4 super extreme data which are longer than 2 million seconds.
# NOTE(review): head(-5) returns all rows EXCEPT the last 5; with a descending sort
# this still displays the largest durations first.
train['trip_duration'].sort_values(ascending = False).head(-5)
# Row labels of the ten longest trips, after excluding the last 4 rows of the
# descending sort (head(-4)) — effectively the top-10 index labels.
super_data_index = train['trip_duration'].sort_values(ascending = False).head(-4)[0:10].index.values
for i in super_data_index:
print(train.iloc[i,])
We see that those trips have a common characteristic: longitude and latitude almost did not change. We can assume that those are not actual trips. The car was parked there while meter was on or something.
Need to come up with a threshold to find out all those outliers.
First try: if longitude and latitude did not change more than 1, and trip duration is greater than 50000 seconds, we can safely assume those are not actual trips.
# Absolute pickup-to-dropoff coordinate deltas (crude proxy for distance travelled).
train['delt_longitude'] = (train['pickup_longitude'] - train['dropoff_longitude']).abs()
train['delt_latitude'] = (train['pickup_latitude'] - train['dropoff_latitude']).abs()
plt.hist(train['delt_longitude'])
plt.show()
plt.hist(train['delt_latitude'])
plt.show()
It is hard to set up a threshold for changes in longitude and latitude since they overlap a lot and some small changes do not mean short distance because of one-way or detour and etc...
First, remove those 4 super extreme data.
# Drop the 4 most extreme durations (top 4 labels of the descending sort).
train = train.drop(train['trip_duration'].sort_values(ascending = False).head(-4)[0:4].index.values, axis = 0)
len(train)
Look at trips between 10k and 50k seconds.
test_long_duration = train[(train.trip_duration < 50000) & (train.trip_duration > 10000)]
test_long_duration
Some trips end at exactly 00:00:00. I think those are trips where drivers end their shift, and they are not actually doing a trip during that time. We should delete them.
Make a copy of the dataframe in case we need the original data or we decide to recover some deleted data.
# Work on a copy so the raw `train` frame stays untouched for recovery.
train1 = train.copy()
Use geopy to calculate geographical distance to assist in cleaning the data.
def create_distance(data):
    """Great-circle distance in miles between a trip's pickup and dropoff.

    ``data`` is a sequence (pickup_lat, pickup_lon, dropoff_lat, dropoff_lon),
    as produced by selecting those four columns from the trip frame.
    """
    coords = list(data)
    origin = (coords[0], coords[1])
    destination = (coords[2], coords[3])
    return geopy.distance.distance(origin, destination).miles
# Geographic (great-circle) miles for every trip via create_distance.
train1['distance'] = list(map(create_distance, train1[['pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude']].values))
Data checkpoint 1.
train1.to_csv("C:\\Users\\lllll\\Desktop\\Responsible ml\\train1.csv", index = None)
Filter out all 00:00:00 ended trips.
dropoff_values = np.array(train1['dropoff_datetime'])
# The last 8 characters of the timestamp string are the 'HH:MM:SS' part.
filters = lambda x: x[-8:]
dropoff_values_final = np.array([filters(n) for n in dropoff_values])
# Positional indices of trips whose dropoff time is exactly midnight.
exact_index = np.array(np.where(dropoff_values_final == '00:00:00')).reshape(-1,).tolist()
exact_index_list = [i for i in exact_index]
zero_ended_data = train1.iloc[exact_index_list,]
zero_ended_data
plt.hist(zero_ended_data['trip_duration'])
Some seems to be like normal data, some are really long trips. Look at hours greater than 20 hours first.
zero_ended_data[zero_ended_data.trip_duration >= 60*60*20]
These are definitely not real trips. The distance suggests that most of these trips are not valid trips.
All these data should be deleted.
# Drop midnight-ended trips longer than 20 hours (almost surely shift-end artifacts).
zeros_to_drop_index = zero_ended_data[zero_ended_data.trip_duration >= 60*60*20].index.values
len(zeros_to_drop_index)
train1 = train1.drop(zeros_to_drop_index, axis = 0)
Look at all trips that have a trip hours greater than 20 hours.
sns.set_style('darkgrid')
sns.scatterplot(x = train1[train1.trip_duration >= 60*60*20]['distance'], y = train1[train1.trip_duration >= 60*60*20]['trip_duration'])
We see a lot of trips have more than 80K seconds duration while the geographic distance are less than 10 miles. It is hard to believe that those are real trips. Therefore, we should delete trips with greater than 20 hours duration while have less than 10 miles of geographic distance.
# Drop: >20h duration AND <=10 geographic miles.
drop_index_2 = train1[(train1.trip_duration >= 60*60*20) & (train1.distance <= 10)].index.values
len(drop_index_2)
train1 = train1.drop(drop_index_2, axis = 0)
len(train1)
Look at all long duration trips again.
cmap = sns.cubehelix_palette(dark=.3, light=.8, as_cmap=True)
# NOTE(review): x is a filtered Series while data=train1 is the full frame —
# seaborn may align these by index, but verify the plot is what is intended.
sns.scatterplot(x = train1[train1.trip_duration >= 60*60*20]['distance'], y = 'trip_duration', data = train1, hue = 'passenger_count', palette = cmap, size =
'passenger_count', sizes = (20, 200))
But there are a huge amount of trips that are around 10 to 15 miles and have durations greater than 82000 seconds which are longer than 22 hours. We should not include those observations. However, we can keep those long distance trips for now.
# Drop: >=82000s (>22h) AND <=25 geographic miles.
drop_index_3 = train1[(train1.trip_duration >= 82000) & (train1.distance <= 25)].index.values
len(drop_index_3)
train1 = train1.drop(drop_index_3, axis = 0)
Look at the overall data trip_duration against distance.
sns.scatterplot(x = 'distance', y = 'trip_duration', data = train1, hue = 'distance', size = 'distance', sizes = (20, 400))
Assume the average speed limit for NYC highway is 55 mph, we still see some trips with impossible long distances. There is no way such trips can be real. Set a speed threshold to clean some additional dirty data.
# Average geographic speed in mph: miles / seconds * 3600.
speed_threshold = 55
sns.scatterplot(x = train1[train1.distance / train1.trip_duration * 60 * 60 >= speed_threshold]['distance'],
y = 'trip_duration', data = train1)
Most of those data are for sure not real trips. Might be GPS error? No idea, but should delete them. However, before doing so, should look at those data that have around 0 distance.
sns.set_style('darkgrid')
scatter = sns.scatterplot(x = train1[(train1.distance / train1.trip_duration * 60 * 60 >= speed_threshold)
& (train1.distance <= 50)]['distance'],
y = train1['trip_duration']/3600,
data = train1)
scatter.plot()
# Reference lines: constant-speed diagonals (miles on x, hours on y).
plt.plot([0,50], [0,50/80], c = 'black')
plt.plot([0,50], [0, 50/55], c = 'b')
print("The black line represent a speed of 80 mph")
print("The blue line represent a speed of 55 mph")
print('Points below the black line means the speed is greater than 80 mph')
Clearly all those data have an average speed greater than 55 GEOGRAPHIC MPH. That's not trustworthy. Even some of them might be real (driving on road like NJ Turnpike when there is no traffic and the real distance between the pickup and dropoff locations are close to the geographic distance), since we are predicting NYC trips, those data are still misleading. Therefore, we should delete them.
Since most trips are inside Manhattan, speed greater than 40 mph should be consider invalid.
# Drop: average geographic speed >= 40 mph.
speed_threshold = 40
drop_index_4 = train1[(train1.distance / train1.trip_duration * 60 * 60 >= speed_threshold)].index.values
len(drop_index_4)
train1 = train1.drop(drop_index_4, axis = 0)
Now take a look at the distribution of trip_duration again
plt.figure(dpi = 128, figsize = (20, 20))
# NOTE: seaborn.distplot is deprecated in newer seaborn (use histplot/displot).
sns.distplot(train1['trip_duration'])
There are still some spikes at greater than 70k seconds. Take a close look again.
sns.scatterplot(x = train1[train1.trip_duration >= 70000]['distance'], y = 'trip_duration', data = train1)
for now, assume distance <= 20 miles are invalid. Drop them.
# Drop: >=70000s AND <=20 miles.
drop_index_5 = train1[(train1.trip_duration >= 70000) & (train1.distance <= 20)].index.values
len(drop_index_5)
train1 = train1.drop(drop_index_5, axis = 0)
When we look at the speed of some trips, we found that some trips have extremely short duration. First let's look at duration less than 60 seconds.
sns.scatterplot(x = train1[train1.trip_duration <= 60]['distance'], y = 'trip_duration', data = train1, hue = 'passenger_count')
plt.plot([0, 0.6], [0, 60])
plt.plot([0, 0.6], [0, 39], c = 'black')
plt.plot([0, 0.6], [0, 86], c = 'r')
print('The red line represent a speed of 25 mph')
print('The blue line represent a speed of 36 mph')
print('The black line represent a speed of 55 mph')
Since stop-n-go and acceleration take times, trips that are shorter than 60 seconds with speed greater than 25 geographic mph is impossible. We should delete those data
# Drop: <=60s trips faster than 25 geographic mph.
drop_index_6 = train1[(train1.trip_duration <= 60) & (train1.distance / train1.trip_duration * 60 * 60 >= 25)].index.values
len(drop_index_6)
train1 = train1.drop(drop_index_6, axis = 0)
Those short trips can be considered as passengers regreting about taking a cab. Check trips shorter than 40 seconds.
sns.scatterplot(x = train1[train1.trip_duration <= 40]['distance'], y = 'trip_duration', data = train1, hue = 'passenger_count')
I still don't believe someone can drive more than 0.2 miles in less than 40 seconds. Should drop those data. Plus, there are a lot of trips took less than 20 seconds. I don't think those data are useful since those are obviously non-usual trips. Should delete them as well.
# Drop: <=40s AND >=0.2 miles; and separately everything <=20s.
drop_index_7 = train1[(train1.trip_duration <= 40) & (train1.distance >= 0.2)].index.values
drop_index_8 = train1[train1.trip_duration <= 20].index.values
print(len(drop_index_7),len(drop_index_8))
train1 = train1.drop(drop_index_7, axis = 0)
train1 = train1.drop(drop_index_8, axis = 0)
Finally, look at the average speed to find other outliers (non-trips).
# Persist average speed (mph) as a column for later filtering and plotting.
train1['speed'] = train1['distance'] / train1['trip_duration'] * 3600
plt.figure(dpi = 128, figsize = (12,12))
sns.scatterplot(x = 'speed', y = 'trip_duration', data = train1, size = 'distance', sizes = (20, 400))
There are a lot of trips that last longer than 20k seconds while having a speed of less than 1 miles. Though they might be real trips (maybe someone pay the cabie to wait there or roadblocked), still meaningless data for most usual trips. Drop them.
# Drop: <=1 mph average speed AND >=20000s duration.
drop_index_9 = train1[(train1.speed <= 1) & (train1.trip_duration >= 20000)].index.values
len(drop_index_9)
train1 = train1.drop(drop_index_9, axis = 0)
That should cleaned most of the invalid trips. Now look at if there is any trips with 0 passenger.
sns.scatterplot(x = train1[train1.passenger_count == 0]['distance'], y = 'trip_duration', data = train1)
Those trips look valid. Just leave it as 0.
The trip duration to airports (JFK, LGA, EWR) might have a different behavior. Create features (miles to airport from pickup, miles to airport from dropoff) to find out if that's true.
Collect airports geographic location data.
# (latitude, longitude) of each airport.
airport_geo = {'jfk':[40.6413111, -73.7781391], 'lga': [40.7730135746, -73.8702298524], 'ewr':[40.6895314, -74.17446239999998]}
airport_list = ['jfk','lga','ewr']
# For each airport, compute the geodesic miles from every pickup and dropoff point.
# NOTE(review): lag_long_pickup/lag_long_dropoff are rebuilt identically on every
# loop iteration — they could be hoisted above the loop.
for i in airport_list:
airport = i
print("Working on airport {}".format(i))
lag_long_pickup = np.array(train1[['pickup_latitude','pickup_longitude']].values).tolist()
lag_long_dropoff = np.array(train1[['dropoff_latitude','dropoff_longitude']].values).tolist()
pick_up_distance = [geopy.distance.distance((lag_long_pickup[n][0],lag_long_pickup[n][1]), (airport_geo[str(airport)][0],airport_geo[str(airport)][1])).miles for n in range(len(lag_long_pickup))]
drop_off_distance = [geopy.distance.distance((lag_long_dropoff[n][0],lag_long_dropoff[n][1]), (airport_geo[str(airport)][0],airport_geo[str(airport)][1])).miles for n in range(len(lag_long_pickup))]
# Sanity check: both distance lists must line up with the frame before assignment.
if (len(pick_up_distance) == len(drop_off_distance)) & (len(pick_up_distance) == len(lag_long_pickup)):
print('Successful {}, appending columns to dataframe'.format(i))
train1['{}_miles_pickup'.format(i)] = pick_up_distance
train1['{}_miles_dropoff'.format(i)] = drop_off_distance
train1
sns.scatterplot(train1[train1.jfk_miles_pickup <= 100]['jfk_miles_pickup'], y = 'trip_duration', data = train1)
We can see that trips around jfk pickup, and trips where the pickup location are 15 miles away from jfk, usually have longer duration. Might due to: Going from jfk to Manhattan cost a long time (NYC traffic is the best); 15 miles away from JFK is probably Manhattan thus terrific traffic.
Same pattern can be found on other two airports, graph is not displayed here.
Checkpoint 2
train1.to_csv("C:\\Users\\lllll\\Desktop\\Responsible ml\\cleaned_with_airport.csv", index = None)
Create some day and month categorical data since it is reasonable to assume that traffic has relationships with time.
First, creates date feature without hours and minutes for future merge with external data.
train1['date'] = None
date_values = np.array(train1['pickup_datetime'])
# The first 10 characters of the timestamp string are 'YYYY-MM-DD'.
filters = lambda x: x[0:10]
date_values_final = np.array([filters(n) for n in date_values])
len(date_values_final)
date_values_final
train1['date'] = date_values_final
Create hour feature.
hour_values = np.array(train1['pickup_datetime'])
# Characters 11:13 of 'YYYY-MM-DD HH:MM:SS' are the zero-padded hour.
filters = lambda x: x[11:13]
hour_values_final = np.array([filters(n) for n in hour_values])
train1['hour'] = hour_values_final
Workday feature. Use datetime module to automatically label days.
import datetime
# Adds a 'day' column (0=Monday ... 6=Sunday) parsed from the date prefix of
# each pickup timestamp. NOTE: mutates the module-level train1 rather than the
# argument's owner — the parameter is only the datetime Series itself.
def get_day(data):
day_fun = lambda x: datetime.datetime.strptime(x[0:10], '%Y-%m-%d').weekday()
day_values_final = np.array([day_fun(n) for n in data])
train1['day'] = day_values_final
return train1
if __name__ == '__main__':
get_day(train1['pickup_datetime'])
0 represent Monday; 1 represent Tuesday; 2 represent Wednesday; ... 6 represent Sunday.
train1
Weather might also affect traffic. This weather data set comes from:
https://www.kaggle.com/mathijs/weather-data-in-new-york-city-2016
by
Mathijs Waegemakers.
# External daily weather data for NYC 2016.
weather = pd.read_csv("C:\\Users\\lllll\\Desktop\\nyc_weather.csv")
weather
Weather data uses 'T' to represent: There is snow/rain but too small to be precisely measured. Thus, convert 'T' into a small number.
def T_converter(x):
    """Map the weather trace marker 'T' to a small numeric value.

    Any value other than the exact string 'T' is passed through unchanged.
    """
    return 0.001 if x == 'T' else x
# Replace every 'T' (trace precipitation) cell across all weather columns.
for i in weather.columns.values:
weather[i] = list(map(T_converter, weather[i]))
weather
Convert the date column into the same format in our training data.
# NOTE(review): date_converted is never used anywhere below.
date_converted = []
def date_converter(data):
    """Reformat the weather frame's 'date' column from DD-MM-YYYY to YYYY-MM-DD.

    Adds the reformatted strings as a new 'Date' column (the original 'date'
    column is left untouched) and returns the mutated frame.

    Fix: the original inner lambda was also named ``date_converter``, shadowing
    the function's own name inside its body; renamed to ``reformat``.
    """
    weather_date = np.array(data['date'])
    reformat = lambda x: datetime.datetime.strptime(x, '%d-%m-%Y')
    date_temp = [reformat(n).strftime('%Y-%m-%d') for n in weather_date]
    data['Date'] = date_temp
    return data
if __name__ == '__main__':
date_converter(weather)
Merge the weather data with the training data.
# Left-join daily weather onto each trip by its 'YYYY-MM-DD' date string.
train1 = train1.merge(weather, left_on = 'date', right_on = 'Date', how = 'left')
train1
Clean all extra columns created by pandas during the merge. I did not specify this during the merge because I found it is easier and clearer to do it afterwards.
# Drop the duplicated join keys and restore the original 'date' column name.
train1 = train1.drop(['date_y', 'Date'], axis = 1)
train1 = train1.rename(columns = {'date_x':'date'})
train1
Checkpoint 3.
train1.to_csv("C:\\Users\\lllll\\Desktop\\Responsible ml\\date_weather_added.csv", index = None)
weather.to_csv('C:\\Users\\lllll\\Desktop\\Responsible ml\\modified_weather.csv', index = None)
# Round-trip through CSV: note this re-parses dtypes (e.g. 'hour' becomes int).
train1 = pd.read_csv('C:\\Users\\lllll\\Desktop\\Responsible ml\\date_weather_added.csv')
Create another copy.
train2 = train1.copy()
Create month data as well.
# Characters 5:7 of 'YYYY-MM-DD ...' are the zero-padded month.
month = lambda x: x[5:7]
month_array = np.array([month(n) for n in np.array(train2['pickup_datetime'])])
month_array
train2['month'] = month_array
Create categorical features: airport trip. 1 if pickup or dropoff inside 1.5 miles range from the airport.
# Flags trips whose pickup OR dropoff is within 1.5 miles of each airport.
# NOTE: reads and mutates the module-level train2 (adds 'jfk_trip', 'lga_trip',
# 'ewr_trip' columns of 0/1); takes no arguments and returns nothing.
def airport_trip():
for i in ['jfk','lga','ewr']:
temp_data = np.array(train2[['{}_miles_pickup'.format(i), '{}_miles_dropoff'.format(i)]].values)
airport_filter = lambda x: 1 if (x[0] <= 1.5) or (x[1] <= 1.5) else 0
airport_trip_cat = [airport_filter(n) for n in temp_data]
train2['{}_trip'.format(i)] = airport_trip_cat
if __name__ == '__main__':
airport_trip()
Create features identifying the pickup and dropoff borough using the uszipcode module.
# One-off helper: derives a rectangular (lat/long) bounding box per NYC borough
# from a borough->zipcode spreadsheet plus the uszipcode database, and writes
# the result to Borough_bound.csv. The nyc_zipcode_file argument is never used;
# the spreadsheet path is hard-coded below.
def create_borough_bound(nyc_zipcode_file):
import uszipcode
import pandas as pd
import numpy as np
import geopy
search = uszipcode.SearchEngine(simple_zipcode = True) # Initiate Zipcode search engine
zipcode_raw = pd.read_excel('C:\\Users\\lllll\\Desktop\\Responsible ml\\nyc_zipcode.xlsx') # NYC borough/neighborhood zipcode data
zip_borough = {} #Split zipcodes into single key
for i in zipcode_raw['Borough'].unique().tolist():
temp_data = zipcode_raw[zipcode_raw.Borough == i]
for n in temp_data['ZIP Codes']:
data = str(n).split(',')
for zips in data:
# NOTE(review): temp_geo is computed but never used in this loop.
temp_geo = search.by_zipcode(zips).values()[-4:]
zip_borough[zips] = [i]
"""
For some reasons this search cannot be done inside an array otherwise only the first zipcode will be returned.
Thus need to append longitude and latitude values seperately.
"""
# assumes the last 4 fields of uszipcode's .values() are the two longitude and
# two latitude bounds, in the order unpacked below — TODO confirm against the
# installed uszipcode version.
for i in zip_borough:
zip_borough[i].append(search.by_zipcode(int(i)).values()[-4:])
"""
Create a temporary pd dataframe to filter boroughs' bound.
"""
zips = []
boroughs = []
lag_upper = []
lag_lower = []
long_left = []
long_right = []
for i in zip_borough:
zips.append(i)
boroughs.append(zip_borough[i][0])
long_left.append(zip_borough[i][1][0])
long_right.append(zip_borough[i][1][1])
lag_upper.append(zip_borough[i][1][2])
lag_lower.append(zip_borough[i][1][3])
zipcode_bound = pd.DataFrame(data = {'Zipcode':zips,'Borough':boroughs,'Upper_lagitude':lag_upper,'Lower_lagitude':lag_lower,'Left_longitude':long_left,'Right_longitude':long_right})
"""
Create a csv file containing each borough's boudns. These codes are beta codes thus not the most efficient codes.
"""
upper = []
lower = []
left = []
right = []
temp_borough = []
# Aggregate the per-zipcode boxes into one min/max box per borough.
# NOTE(review): 'lower' takes the min of Upper_lagitude rather than
# Lower_lagitude — possibly intentional, but verify.
for i in zipcode_raw['Borough'].unique().tolist():
temp_borough.append(i)
upper.append(zipcode_bound[zipcode_bound.Borough == i]['Upper_lagitude'].max())
lower.append(zipcode_bound[zipcode_bound.Borough == i]['Upper_lagitude'].min())
left.append(zipcode_bound[zipcode_bound.Borough == i]['Left_longitude'].min())
right.append(zipcode_bound[zipcode_bound.Borough == i]['Right_longitude'].max())
borough_bound = pd.DataFrame(data = {'Borough':temp_borough,'Upper_lag':upper,'Lower_lag':lower,'Left_long':left,'Right_long':right})
borough_bound.to_csv('C:\\Users\\lllll\\Desktop\\Responsible ml\\Borough_bound.csv', index = None)
# NOTE(review): this variable is immediately shadowed by the bound() function
# defined just below; consider renaming one of them.
bound = pd.read_csv('C:\\Users\\lllll\\Desktop\\Responsible ml\\Borough_bound.csv')
bound
def bound(data):
    """Classify a (latitude, longitude) pair into an NYC borough.

    Uses hard-coded rectangular bounding boxes. The boxes overlap, so the
    first match in the order below wins (same precedence as the original
    if/elif chain); a point outside every box is labelled 'Outside NYC'.
    """
    coords = list(data)
    lat = float(coords[0])
    lon = float(coords[1])
    # (name, lat_low, lat_high, lon_low, lon_high) — order matters.
    boxes = (
        ('Manhattan',     40.709044, 40.877650, -74.047285, -73.910587),
        ('Queens',        40.576215, 40.800888, -73.962795, -73.700837),
        ('Brooklyn',      40.584575, 40.739446, -74.042492, -73.855660),
        ('Bronx',         40.813731, 40.915282, -73.933406, -73.764661),
        ('Staten Island', 40.520732, 40.649468, -74.255895, -74.051416),
    )
    for name, lat_lo, lat_hi, lon_lo, lon_hi in boxes:
        if lat_lo <= lat <= lat_hi and lon_lo <= lon <= lon_hi:
            return name
    return 'Outside NYC'
# Label each trip end with its borough (or 'Outside NYC').
train2['Pickup_borough'] = list(map(bound, train2[['pickup_latitude','pickup_longitude']].values))
train2['Dropoff_borough'] = list(map(bound, train2[['dropoff_latitude','dropoff_longitude']].values))
Traffic in Manhattan also have different pattern, use a distance from the Central Park to try to capture them.
def central_park_distance(data):
    """Geodesic miles from a (latitude, longitude) pair to Central Park."""
    coords = list(data)
    central_park = (40.785091, -73.968285)
    return geopy.distance.distance((coords[0], coords[1]), central_park).miles
# Distance-to-Central-Park features for both trip ends.
train2['cp_distance_pickup'] = list(map(central_park_distance, train2[['pickup_latitude', 'pickup_longitude']].values))
train2['cp_distance_dropoff'] = list(map(central_park_distance, train2[['dropoff_latitude', 'dropoff_longitude']].values))
Save new data again.
train2.to_csv('C:\\Users\\lllll\\Desktop\\Responsible ml\\newest_train_beta1.csv', index = None)
train2 = pd.read_csv('C:\\Users\\lllll\\Desktop\\Responsible ml\\newest_train_beta1.csv')
Another copy for checkpoint
train3 = train2.copy()
Add detailed weather data.
# Hourly METAR weather observations for KNYC (Central Park station).
knyc = pd.read_csv('C:\\Users\\lllll\\Desktop\\Responsible ml\\KNYC_Metars.csv')
# Split the METAR timestamp into date and year, keep 2016 only.
date_filter = lambda x: x[0:10]
knyc_date = [date_filter(n) for n in knyc['Time'].values]
knyc['Date'] = knyc_date
year_filter = lambda x: x[0:4]
knyc_year = [year_filter(n) for n in knyc['Time'].values]
knyc['Year'] = knyc_year
knyc = knyc[knyc.Year == '2016']
# Truncate trip timestamps to the hour ('YYYY-MM-DD HH:00:00') so they can be
# joined against the hourly METAR 'Time' key.
time_filter = lambda x: str(x[0:13])+':00:00'
train3_time = [time_filter(n) for n in train3['pickup_datetime'].values]
train3_dropofftime = [time_filter(n) for n in train3['dropoff_datetime'].values]
train3['pickup_time'] = train3_time
train3['dropoff_time'] = train3_dropofftime
train3 = train3.merge(knyc, left_on = 'pickup_time', right_on = 'Time', how = 'left')
train3 = train3.drop(['pickup_time','dropoff_time','Time','Year','Date'], axis = 1)
# External table mapping trip id -> borough/neighborhood names and codes.
try1 = pd.read_csv('C:\\Users\\lllll\\Desktop\\Responsible ml\\combined+table+with+neighborhood+and+boro+information.csv')
try1 = try1[['id','pickup_boro','pickup_neighborhood_name','pickup_neighborhood_code','dropoff_neighborhood_name','dropoff_neighborhood_code']]
train3 = train3.merge(try1, on = 'id', how = 'left')
# Dates with extreme weather reported by NYC city.
weather_event = ['20160110', '20160113', '20160117', '20160123',
                 '20160205', '20160208', '20160215', '20160216',
                 '20160224', '20160225', '20160314', '20160315',
                 '20160328', '20160329', '20160403', '20160404',
                 '20160530', '20160628']
# train3['date'] holds 'YYYY-MM-DD' strings (the frame was round-tripped through
# CSV above), so the comparison must also be against strings. The original code
# compared against datetime.date objects (.dt.date), which can never equal a
# string — every trip was silently flagged 0.
weather_event = pd.to_datetime(weather_event, format = '%Y%m%d').strftime('%Y-%m-%d')
train3['extreme_weather'] = train3.date.isin(weather_event).map({True: 1, False: 0})
train3
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Integer-encode string categoricals into parallel 'cat_*' columns, keeping the
# original string columns for reference.
cat_cols = ['Events','Conditions','pickup_neighborhood_code','dropoff_neighborhood_code']
for i in cat_cols:
print(i)
train3[i] = train3[i].astype(str)
train3['cat_{}'.format(i)] = le.fit_transform(train3[i])
train3
# Free the large intermediate frames.
del knyc
del try1
Replace space with _ for column names.
# Weather column names that contain a space, and the rename map built from them.
modify_list = ['maximum temperature','minimum temperature','average temperature','snow fall','snow depth']
modified_dic = {}
def modified_weather(data):
    """Return *data* with spaces replaced by underscores in the weather columns.

    Only the five daily-weather columns that contain a space are renamed
    (e.g. 'snow fall' -> 'snow_fall'); every other column keeps its name.

    Fix: made self-contained — the original read the module-level
    ``modify_list`` and mutated the module-level ``modified_dic`` on every
    call, rebuilding the same mapping each time.
    """
    space_cols = ['maximum temperature', 'minimum temperature',
                  'average temperature', 'snow fall', 'snow depth']
    rename_map = {}
    for col in data.columns.values:
        rename_map[col] = col.replace(' ', '_') if col in space_cols else col
    return data.rename(columns = rename_map)
# Apply the column rename to the working frame.
train3 = modified_weather(train3)
Create categorical data identifying trips direction borough-wise.
def going_to_manhattan(data):
    """Label a trip's direction relative to Manhattan.

    ``data`` is (pickup_borough, dropoff_borough). Branch order is preserved
    exactly from the original rules; note the first rule also captures trips
    from 'Outside NYC' into Manhattan, so the 'Goin NYC Manhattan' branch
    below it can never fire.
    """
    boroughs = list(data)
    pickup = boroughs[0]
    dropoff = boroughs[1]
    if pickup != 'Manhattan' and dropoff == 'Manhattan':
        return 'To Manhattan'
    if pickup == 'Manhattan' and dropoff != 'Manhattan':
        return 'Out Manhattan'
    if pickup == 'Outside NYC' and dropoff != 'Outside NYC' and dropoff != 'Manhattan':
        return 'Goin NYC not Manhattan'
    if pickup == 'Outside NYC' and dropoff == 'Manhattan':
        return 'Goin NYC Manhattan'
    if pickup == 'Manhattan' and dropoff == 'Manhattan':
        return 'Inside Manhattan'
    return 'Not Manhattan'
train3['from_to'] = list(map(going_to_manhattan, train3[['Pickup_borough','Dropoff_borough']].values))
Create holiday feature using holidays module.
# Returns the US holiday name for a date, or None when the date is not a
# holiday (holidays.get behaves like dict.get).
# NOTE(review): a fresh UnitedStates() calendar is built per call — could be
# hoisted for speed.
def get_holidays(data):
#data = list(data)
us_holidays = holidays.UnitedStates()
x = us_holidays.get(data)
return x
train3['holiday'] = list(map(get_holidays, train3['date'].values))
Label rush hour.
def get_rushhour(data):
    """Bucket an hour of day into 'morning_rush' (6-9), 'eve_rush' (15-20) or 'not_rush'.

    Coerces *data* to int so both integer hours and zero-padded string hours
    (e.g. '07', as the 'hour' column was originally created from the datetime
    string) are handled correctly; with the original ``in range(...)`` test a
    string hour never matched and every row silently became 'not_rush'.
    """
    hour = int(data)
    if 6 <= hour < 10:
        return 'morning_rush'
    if 15 <= hour < 21:
        return 'eve_rush'
    return 'not_rush'
# Label every trip's pickup hour as morning/evening/non rush.
train3['rush_hour'] = list(map(get_rushhour, train3['hour'].values))
Create one-hot data for xgboost because one-hot performs better than normal encoding in xgboost.
def create_onehot(categorical, data):
    """One-hot encode each column named in *categorical* and merge the dummies in.

    Borough columns get a shortened prefix ('Pickup_'/'Dropoff_', i.e. the
    column name with its '_borough' suffix stripped); every other column is
    prefixed with its full name. Returns a new frame; the dummy columns are
    index-aligned with *data*.
    """
    borough_cols = ('Pickup_borough', 'Dropoff_borough')
    for cat in categorical:
        dummies = pd.get_dummies(data[cat])
        prefix = cat[0:-8] if cat in borough_cols else cat
        renamed = {col: '{}_{}'.format(prefix, col) for col in dummies.columns.values}
        dummies = dummies.rename(columns = renamed)
        data = data.merge(dummies, how = 'left', left_index = True, right_index = True)
    return data
# NOTE(review): several names here ('workday','Holiday','airport_pickup',
# 'airport_dropoff') do not match any column created above ('day','holiday',
# 'jfk_trip','lga_trip','ewr_trip', ...) — get_dummies on a missing column
# would raise KeyError; verify this list against the actual columns.
categorical_list = ['vendor_id','store_and_fwd_flag','workday','month','Pickup_borough','Dropoff_borough','Holiday','rush_hour','airport_pickup','airport_dropoff','Events','Conditions', 'extreme_weather']
train3 = create_onehot(categorical_list, train3)
My computer's memory is 16Gb but still cannot handle one-hot encoded neighborhood name (150+ unique values). Tree model should be still able to find some pattern even if this is not one-hot encoded.
le = LabelEncoder()
for i in ['pickup_neighborhood_name','dropoff_neighborhood_name']:
temp = train3[i]
train3['cat_{}'.format(i)] = le.fit_transform(temp)
# OSRM fastest-route data for the training ids, split across two files.
route_api = pd.concat([pd.read_csv('C:\\Users\\lllll\\Desktop\\Responsible ml\\fastest_routes_train_part_1.csv'),pd.read_csv('C:\\Users\\lllll\\Desktop\\Responsible ml\\fastest_routes_train_part_2.csv')])
route_api['total_distance'] = route_api['total_distance'] / 1000 / 1.6 # Convert meters to miles
Add routing movements summary for each trip's possible fastest route.
def _count_maneuvers(data, *tokens):
    """Count how many '|'-separated entries of *data* equal any of *tokens*.

    Shared helper for the per-maneuver counters below; each entry must match a
    token exactly (same semantics as list.count on the split parts).
    """
    parts = data.split('|')
    return sum(parts.count(token) for token in tokens)

def get_left_turns(data):
    """Number of left turns, including slight and sharp lefts."""
    return _count_maneuvers(data, 'left', 'slight left', 'sharp left')

def get_right_turns(data):
    """Number of right turns, including slight and sharp rights."""
    return _count_maneuvers(data, 'right', 'slight right', 'sharp right')

def get_straight_turns(data):
    """Number of 'straight' steps."""
    return _count_maneuvers(data, 'straight')

def get_onramp(data):
    """Number of 'on ramp' maneuvers."""
    return _count_maneuvers(data, 'on ramp')

def get_offramp(data):
    """Number of 'off ramp' maneuvers."""
    return _count_maneuvers(data, 'off ramp')

def get_fork(data):
    """Number of 'fork' maneuvers."""
    return _count_maneuvers(data, 'fork')

def get_endofroad(data):
    """Number of 'end of road' maneuvers."""
    return _count_maneuvers(data, 'end of road')

def get_continue(data):
    """Number of 'continue' maneuvers."""
    return _count_maneuvers(data, 'continue')

def get_roundabout(data):
    """Number of 'roundabout' maneuvers (exact match only)."""
    return _count_maneuvers(data, 'roundabout')

def get_rotary(data):
    """Number of 'rotary' maneuvers."""
    return _count_maneuvers(data, 'rotary')

def get_roundaboutturn(data):
    """Number of 'roundabout turn' maneuvers (exact match only)."""
    return _count_maneuvers(data, 'roundabout turn')
# Derive per-route maneuver counts from the OSRM step strings.
route_api['on_ramps'] = list(map(get_onramp, route_api['step_maneuvers']))
route_api['off_ramps'] = list(map(get_offramp, route_api['step_maneuvers']))
route_api['forks'] = list(map(get_fork, route_api['step_maneuvers']))
route_api['end_roads'] = list(map(get_endofroad, route_api['step_maneuvers']))
route_api['continue'] = list(map(get_continue, route_api['step_maneuvers']))
route_api['roundabout'] = list(map(get_roundabout, route_api['step_maneuvers']))
route_api['rotary'] = list(map(get_rotary, route_api['step_maneuvers']))
route_api['roundaboutturn'] = list(map(get_roundaboutturn, route_api['step_maneuvers']))
route_api['left_turns'] = list(map(get_left_turns, route_api['step_direction']))
route_api['right_turns'] = list(map(get_right_turns, route_api['step_direction']))
route_api['straight_turns'] = list(map(get_straight_turns, route_api['step_direction']))
route_api_final = route_api.copy()
Only merge useful features from the route_api data.
route_api_final = route_api_final[['id','total_distance','total_travel_time','number_of_steps','left_turns',
'right_turns','straight_turns','on_ramps','off_ramps','forks','end_roads','continue'
,'roundabout','rotary','roundaboutturn']]
route_api_final
train3 = train3.merge(route_api_final, on = 'id', how = 'left')
del route_api_final
Fill nan for these categorical features.
# Report columns with missing values, fill weather-event NaNs with 'None', then
# drop any remaining incomplete rows.
for i in train3.columns.values:
if train3[i].isnull().values.any():
print(i, train3[i].isnull().values.sum())
for i in ['Events','Conditions']:
train3[i] = train3[i].fillna('None')
train3 = train3.dropna(how = 'any', axis = 0)
# NOTE(review): the four lines below exactly repeat the block above; the second
# pass is a no-op (idempotent) and could be removed.
for i in train3.columns.values:
if train3[i].isnull().values.any():
print(i, train3[i].isnull().values.sum())
for i in ['Events','Conditions']:
train3[i] = train3[i].fillna('None')
train3 = train3.dropna(how = 'any', axis = 0)
# Raw (non-encoded) categorical columns excluded from the model feature list.
categorical = ['vendor_id','store_and_fwd_flag','month','Pickup_borough','Dropoff_borough',
'rush_hour','Events','Conditions',
'holiday','pickup_neighborhood_name','dropoff_neighborhood_name','pickup_neighborhood_code',
'dropoff_neighborhood_code','from_to','cat_pickup_neighborhood_code','cat_dropoff_neighborhood_code','day']
train_feature_list = []
"""
Use remove here for manual feature selection
"""
# Start from every column, then strip target, leakage columns (speed, distance,
# dropoff time, total_travel_time) and raw categoricals.
X = [i for i in train3.columns.values]
#X.remove('id')
X.remove('trip_duration')
X.remove('speed')
#X.remove('delt_longitude')
#X.remove('delt_latitude')
X.remove('distance')
X.remove('pickup_datetime')
X.remove('dropoff_datetime')
X.remove('date')
X.remove('Temp.')
X.remove('Windchill')
X.remove('Wind Dir')
X.remove('Precip')
X.remove('pickup_boro')
X.remove('pickup_latitude')
X.remove('pickup_longitude')
X.remove('dropoff_latitude')
X.remove('dropoff_longitude')
X.remove('total_travel_time')
X.remove('cat_pickup_neighborhood_name')
X.remove('cat_dropoff_neighborhood_name')
for i in categorical:
X.remove(i)
y = 'trip_duration'
X
train_feature_list = X
GB tree does not require data scaling, thus create training and validation set directly.
X, y = train3[train_feature_list], train3[y]
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.2, random_state = 0)
# NOTE(review): 'validation' aliases test_X (no copy), so the assignment below
# also mutates test_X and may trigger a SettingWithCopyWarning — consider
# test_X.copy() here.
validation = test_X
validation['trip_duration'] = test_y
# 'id' is kept in the split for traceability, then dropped before modeling.
train_X, test_X = train_X.drop(['id'], axis = 1), test_X.drop(['id'], axis = 1)
train_feature_list.remove('id')
validation here is for model intepretation.
Checkpoint 4
validation.to_csv('C:\\Users\\lllll\\Desktop\\Responsible ml\\validation_for_ICE.csv', index = None)
train3.to_csv('C:\\Users\\lllll\\Desktop\\Responsible ml\\newest_train_beta2.csv', index = None)
del train3
train3 = pd.read_csv('C:\\Users\\lllll\\Desktop\\Responsible ml\\newest_train_beta2.csv', encoding = 'ISO-8859-1')
xgboost.__version__
Old params for original xgboost module. Not for sckit-learn wrapped xgboost model.
This variable is for storage use. Not for model use.
Parameters below arrived by randomized search. Search codes and outputs are deleted to save space.
Do not run if cuda and cudnn are not installed. GPU xgb model. Only support NVIDIA and requires cuda >= 9.0 plus cudnn.
# Kept for reference only (native xgboost.train param dict); the model below
# uses the sklearn wrapper instead.
params = {'tree_method':'gpu_hist', 'reg_lambda':0.8, 'reg_alpha':0.8,
'colsample_bytree':0.7, 'subsample':1, 'min_child_weight':10, 'max_depth':13, 'learning_rate':0.07,
'verbosity':2, 'eval_metric':'rmse', 'seed':0 }
gpu_xgb_model = xgboost.XGBRegressor(gpu_id = 0, tree_method = 'gpu_hist', reg_lambda = 0.8, reg_alpha = 0.8,
colsample_bytree = 0.7, n_estimators = 5000, max_depth = 13, learning_rate = 0.07,
verbosity = 2, booster = 'gbtree', min_child_weight = 10, subsample = 1, random_state = 0,
num_parallel_tree = 1)
Use early_stopping_rounds and eval_metric to prevent overfitting and reduce computation time.
# NOTE(review): the sklearn API expects eval_set as a list of (X, y) 2-tuples;
# the 3-tuple (test_X, test_y, 'validation') may raise in some xgboost
# versions — verify against the installed version.
gpu_xgb_model.fit(train_X, train_y, eval_metric = 'rmse', eval_set = [(test_X, test_y, 'validation')],
early_stopping_rounds = 500, verbose = True)
Test RMSE of 288.91968.
This code can convert xgboost 1.1.0 original model (xgboost.train) output to shap module compatible format but DMatrix is not friendly to use. Thus omitted. Code is saved for future use.
# Only run this for xgboost 1.1.0 dmatrix input
#model_bytearray = gpu_xgb_model.save_raw()[4:]
# Dead code kept for future use: returns the module-level model_bytearray,
# which only exists if the commented line above is re-enabled.
def myfun(self=None):
return model_bytearray
#gpu_xgb_model.save_raw = myfun
# Pair every feature name with its trained importance, sort descending and
# plot as a horizontal bar chart (largest at the top).
feature_imp_dict = {}
imp_list = gpu_xgb_model.feature_importances_
for i in range(len(train_feature_list)):
feature_imp_dict[train_feature_list[i]] = imp_list[i]
test_fea_dic = feature_imp_dict.copy()
sorted_test = sorted(test_fea_dic.items(), key = lambda kv: kv[1], reverse = True)
sorted_importance_list = collections.OrderedDict(sorted_test)
plt.figure(dpi = 128, figsize = (16,16))
plt.barh([i for i in sorted_importance_list.keys()][::-1], [i for i in sorted_importance_list.values()][::-1], height = 0.5,
tick_label = [i for i in sorted_importance_list.keys()][::-1])
plt.yticks(np.arange(len([i for i in sorted_importance_list.keys()][::-1])))
plt.show()
As we can see from the feature-importance plot, the features that matter most are trips to JFK airport, total trip distance, number of steps, trips on weekends, and trips to Manhattan. It is quite reasonable that these features are important.
# Reload the interpretation hold-out saved at the earlier checkpoint (it
# already carries the target column).
validation_data = pd.read_csv('C:\\Users\\lllll\\Desktop\\Responsible ml\\validation_for_ICE.csv')
def test_ICE(feature, data, model, feature_list, resolution = 20, bins = None):
pd.options.mode.chained_assignment = None
par_dep_frame = pd.DataFrame(columns = [feature, 'partial_dependence'])
col_cache = data.loc[:, feature].copy(deep=True)
if bins is None:
min_ = data[feature].min()
max_ = data[feature].max()
by = (max_ - min_) / resolution
bins = np.arange(min_, (max_ + by), (by + np.round((1. / resolution) * by, 3))) # bins calculation from referenced author
for j, _ in enumerate(bins):
if j + 1 <= len(bins):
data.loc[:, feature] = bins[j] # Not using j+1 because this dataset contains too many extreme data points on the upper side.
par_dep_i = pd.DataFrame(model.predict(data[feature_list]))
par_dep_j = par_dep_i.mean()[0]
par_dep_frame = par_dep_frame.append({feature:bins[j],'partial_dependence': par_dep_j}, ignore_index=True)
data.loc[:, feature] = col_cache
return par_dep_frame
Since there are no specific observations of interest, we simply look at the data at each 10th percentile of trip_duration.
def get_percentile(prediction, data, id_):
    """Return {percentile: id} for rows at each 10th percentile of *prediction*.

    Key 0 maps to the id of the smallest prediction, key 99 to the largest;
    keys 10..90 pick evenly spaced positions in the sorted order.
    """
    ranked = data.copy(deep = True)
    ranked = ranked.sort_values(prediction).reset_index()
    last_row = int(data.shape[0] - 1)
    ids_by_percentile = {0: ranked.loc[0, id_], 99: ranked.loc[last_row, id_]}
    step = ranked.shape[0] // 10
    for decile in range(1, 10):
        ids_by_percentile[decile * 10] = ranked.loc[decile * step, id_]
    return ids_by_percentile
Get percentile first.
# Identify the validation row sitting at each decile of the model prediction.
xgb_per_dict = get_percentile('prediction_1', validation_data, 'id')
Select some high importances features to compute.
features_to_plot = ['jfk_miles_dropoff','ewr_miles_dropoff','ewr_miles_pickup','jfk_miles_pickup','cp_distance_dropoff','hour','delt_longitude','total_distance','day_3','number_of_steps','day_0','day_6','day_2','day_5']
Compute ICE and partial dependence for selected features.
Computational heavy codes.
# Partial-dependence curve per feature over the full validation frame.
ice_dict = {}
for i in features_to_plot:
    #print('Working on feature: ', i)
    ice_dict[i] = test_ICE(i, validation_data, gpu_xgb_model, train_feature_list)
# ICE curves: repeat the sweep for the single row at each prediction decile,
# reusing the bins of the full-data pass so all curves share one x-grid.
for i in features_to_plot:
    bins = list(ice_dict[i][i])
    print('Working on features: ', i)
    for j in sorted(xgb_per_dict.keys()):
        col_name = 'Percentile_{}'.format(j)
        ice_dict[i][col_name] = test_ICE(i, validation_data[validation_data['id'] == str(xgb_per_dict[j])], gpu_xgb_model, train_feature_list, bins = bins)['partial_dependence']
save the dictionary because it takes too long to compute.
# Flatten {feature -> frame} into one wide frame and checkpoint it to disk.
test_ice_df = pd.DataFrame()
for i in ice_dict.keys():
    for n in ice_dict[i].keys():
        if n != str(i):
            test_ice_df['{}_{}'.format(i, n)] = ice_dict[i][n]
        else:
            test_ice_df[n] = ice_dict[i][n]
test_ice_df.to_csv('C:\\Users\\lllll\\Desktop\\Responsible ml\\ice_values_selected.csv', index = None)
def plot_pd_ice(x_name, par_dep_frame, ax=None):
    """
    Plot ICE curves (every column of *par_dep_frame* except
    'partial_dependence') with the mean partial-dependence curve overlaid.

    The ax parameters is not really needed, but keep it here just in case want
    to plot categorical feature with extremly high cardinality.
    If not none, need to modify the second function to group bins.
    For now this is disabled because we are not plotting high cardinality data.
    """
    if ax is None:
        # Stand-alone figure: ICE curves coloured by colormap, mean PD in grey.
        fig, ax = plt.subplots()
        par_dep_frame.drop('partial_dependence', axis=1).plot(x=x_name,
                                                              colormap='gnuplot',
                                                              ax=ax)
        par_dep_frame.plot(title = 'Partial Dependence with ICE: ' + x_name,
                           x=x_name,
                           y='partial_dependence',
                           color='grey',
                           linewidth=2,
                           ax=ax)
    else:
        # Caller-provided axes (side-by-side layout): mean PD drawn in red.
        par_dep_frame.drop('partial_dependence', axis=1).plot(x=x_name,
                                                              colormap='gnuplot',
                                                              ax=ax)
        par_dep_frame.plot(title = 'Partial Dependence with ICE: ' + x_name,
                           x=x_name,
                           y='partial_dependence',
                           color='red',
                           linewidth=2,
                           ax=ax)
def plot_ICE_par(feature, target, data, ice_dict):
    """Two-panel figure for one feature: a histogram with the mean-prediction
    curve overlaid (left), and the PDP/ICE curves (right).

    feature: column to plot; target: prediction column name (used in labels);
    data: frame supplying the histogram values; ice_dict: {feature: frame}
    produced by test_ICE, with per-percentile ICE columns attached.
    """
    plt.figure(dpi = 300)
    fig, (ax, ax2) = plt.subplots(ncols = 2, nrows = 1, sharey = False, sharex = False)
    plt.subplots_adjust(left=-0.2, right=1.7, wspace=0.3)
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency',fontsize = 14)
    ax.set_title('Histogram with Mean ' + target + ' Overlay')
    ax.hist(data[feature], color = 'black')
    # Secondary y-axis carries the mean-prediction curve over the histogram.
    ax1 = ax.twinx()
    _ = ax1.set_ylim((ice_dict[feature]['partial_dependence'].min(), ice_dict[feature]['partial_dependence'].max()))
    _ = ax1.plot(ice_dict[feature][feature], ice_dict[feature]['partial_dependence'], color='r')
    _ = ax1.set_ylabel('Mean ' + target, fontsize = 14)
    _ = ax1.yaxis.set_label_coords(1.14, 0.6)
    _ = ax1.legend(['Mean ' + target], loc = 0)
    # Right panel: PD + ICE curves (red overlay because axes are supplied).
    plot_pd_ice(feature,ice_dict[feature], ax = ax2)
    _ = ax2.legend(bbox_to_anchor=(1.05, 0.1),
                   loc=3,
                   borderaxespad = 0)
# Render the histogram + PDP/ICE panels for every selected feature.
for i in features_to_plot:
    plot_ICE_par(i, 'prediction_1', validation_data, ice_dict)
The plots above show the expected behavior for the selected features: PDPs for the taxi-duration prediction model and the 14 features. From the PDPs, we can see that the largest differences appear in total distance — the longer the total distance, the longer the trip duration. The other features show little fluctuation in their PDPs as their values grow large or, for categorical features, change from one level to another.
As shown in the plots above, aside from the PDP line, each curve represents some percentile of the predicted taxi duration and visualizes the effect of varying the selected feature value on the model's prediction, holding all other features constant. An ICE plot can highlight variation in the fitted values across the range of a feature, suggesting where and to what extent heterogeneity might exist. From the plots, we notice that most features follow the same pattern and show no interaction with each other.
Set up graphviz enviornment.
import os
# Make the Graphviz `dot` executable discoverable for the tree export below.
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
Instead of using a gbm or xgboost model, use a simple decision tree model here.
To run the code, need to install:
graphviz
pylab
subprocess
def surrogate_interpret(features, target, data, depth = 5, validation = None, graphviz_display = True, plot_path = 'C:\\Users\\lllll\\Desktop\\Responsible ml', name = 'Kaggle_taxi_xgb'):
    """Fit a shallow decision-tree surrogate to the black-box predictions and
    export its structure via Graphviz.

    features: model input columns; target: column holding the black-box
    model's predictions; depth: surrogate tree depth; validation: optional
    held-out frame — when None, 5-fold CV RMSE is reported instead.
    Returns a graphviz Source when graphviz_display is True, otherwise renders
    the exported png inline with matplotlib.
    """
    from graphviz import Source
    import matplotlib.image as mpimg
    from subprocess import check_call
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.tree import export_graphviz
    # Fixes vs the original:
    #  * `if validation == None` — on a DataFrame, == None compares
    #    elementwise and the `if` raises ValueError; identity test is correct.
    #  * the IPython-only `%pylab inline` magic (invalid in plain Python, and
    #    it floods the namespace) and the unused `import pylab` were removed.
    #  * the duplicated surrogate construction is hoisted out of the branches.
    surrogate = DecisionTreeRegressor(random_state = 0, max_depth = int(depth))
    x, y = data[features], data[target]
    if validation is None:
        # No hold-out supplied: estimate generalization error by 5-fold CV.
        scores = cross_validate(surrogate, x, y, cv = 5, scoring = 'neg_root_mean_squared_error', return_train_score = True)
        surrogate.fit(x, y)
        print('Training RMSE for the surrogate model: ', abs(scores['train_score'].mean()),
              '\nValidation RMSE for the surrogate model: ', abs(scores['test_score'].mean()))
    else:
        surrogate.fit(x, y)
        validation_rmse = math.sqrt(mse(surrogate.predict(validation[features]), validation[target]))
        train_rmse = math.sqrt(mse(surrogate.predict(x), y))
        print('Training RMSE for the surrogate model: ', train_rmse,
              '\nValidation RMSE for the surrogate model: ', validation_rmse)
    dot_file_path = os.path.join(plot_path, 'surrogate_model_{}.dot'.format(name))
    print('Saving graphviz dot file to: {}'.format(dot_file_path))
    tree_plot = export_graphviz(surrogate, feature_names = features,
                                class_names = target, out_file = dot_file_path)
    png_file_path = os.path.join(plot_path, 'surrogate_model_{}.png'.format(name))
    print('Converting dot file to png and saving to: {}'.format(png_file_path))
    check_call(['dot', '-Tpng', dot_file_path, '-o', png_file_path]) # Convert graphviz dot file to png file for storage
    #display(Image(filename = png_file_path))
    if graphviz_display:
        return Source.from_file(dot_file_path)
    else:
        img = mpimg.imread(png_file_path)
        plt.figure(dpi = 800, figsize = (20, 20))
        imgplot = plt.imshow(img)
        plt.show()
# Fit and display the decision-tree surrogate on the selected features.
surrogate_interpret(features_to_plot, 'prediction_1', validation_data)
Although this is a simple surrogate model, due to the size of the data and the complexity of the problem, the tree model is still a little bit huge. But this still provides us an idea of how the model predicts.
From the surrogate model, we notice that it does not perform well on longer trips, since the mean trip duration in the data is about 1000 seconds; note the high MSE values for the long trips.
Initiate shap visualization function.
shap.initjs()
# TreeExplainer computes SHAP values natively for tree ensembles.
explain = shap.TreeExplainer(gpu_xgb_model)
import time
#features_to_use = [i for i in sorted_importance_list.keys()][-1:-11]
data_to_use = validation_data[train_feature_list]
# Time the SHAP computation; approximate=True trades exactness for speed.
start = time.time()
shap_value = explain.shap_values(data_to_use, approximate = True)
end = time.time()
print('Samples: ', len(data_to_use), 'time: ', end - start)
# Mean SHAP value per feature (signed mean, not mean |SHAP|), sorted for a
# labelled horizontal bar chart.
ave_shap_dict = {}
for i in range(len(train_feature_list)):
    ave_shap_dict[train_feature_list[i]] = np.mean(shap_value[:,i])
ave_shap_df = pd.DataFrame(data = {'feature':[i for i in ave_shap_dict.keys()], 'mean_shap_value':[i for i in ave_shap_dict.values()]})
ave_shap_df = ave_shap_df.sort_values(by = 'mean_shap_value', ascending = False)
Complete global shapley value plot.
plt.figure(dpi = 300, figsize = (12,12))
sns.set_style('darkgrid')
splot = sns.barplot(y = 'feature', x = 'mean_shap_value', data = ave_shap_df, orient = 'h')
# Annotate each bar with its value, offset slightly past the bar end.
for p in splot.patches:
    width = p.get_width()
    plt.text(np.max(np.array(ave_shap_df['mean_shap_value']))/25 + p.get_width(), p.get_y() + 0.55 * p.get_height(),
             '{:1.5f}'.format(width),
             ha='center', va='center')
The plot above shows the global Shapley value for each feature in our model (SHAP feature importance measured as the mean Shapley value). As we can see, jfk_miles_dropoff is still the most important feature, followed by ewr_miles_dropoff, ewr_miles_pickup, jfk_miles_pickup, and so on.
Global Shapley plot.
# Built-in SHAP summary bar plot (mean |SHAP| per feature) for comparison.
shap.summary_plot(shap_value, data_to_use, plot_type = 'bar')
Compared with the previous figure based on feature importance, this shows slight differences: the feature ranked at the top becomes jfk_miles_dropoff, and total distance ranks lower than in the feature-importance plot.
Feature-wise shapley value.
from IPython.display import clear_output
# One dependence plot per selected feature; SHAP chooses the interaction
# feature used for colouring automatically.
for i, values in enumerate(features_to_plot):
    shap.dependence_plot(values, shap_value, data_to_use, alpha = 0.2, dot_size = 5, show = False)
    plt.title('{}: {} shap dependence plot'.format(i, values))
    plt.ylabel('Shap value for feature {}'.format(values))
    plt.show()
For all the plots above, the y-axis on the left is the SHAP value, and the feature on the right is the correlated feature selected by the algorithm; red represents higher feature values. From these plots, we notice that as hour gets larger (approaching night time), longer trips occur with higher SHAP values. Most jfk_miles_dropoff values lie between 10-13, with a spike around 0, and these also show high Shapley values.
Local (sample-wise) shapley explain.
Random samples to see:
# Local SHAP explanations for 15 randomly chosen validation rows.
for i in range(0, 15):
    # Bug fix: randint's upper bound is exclusive, so the original
    # len(validation_data) + 1 could draw index == len and crash iloc;
    # the valid index range is [0, len).
    random_sample = np.random.randint(0, len(validation_data), 1)
    print('Displaying local shapley explaination for observation {}'.format(random_sample))
    print('Model output: {}'.format(float(validation_data.iloc[random_sample, -1].values)))
    display(shap.force_plot(explain.expected_value, shap_value[random_sample,:], data_to_use.iloc[random_sample,:]))
The plots above explain the contribution of each feature to each selected trip. Features that push the prediction up from the base value are shown in red; the others are shown in blue. As we can see from all the plots above, for individual trips the features that contribute most are mostly jfk_miles_dropoff, delt_longitude, and total_distance, which are also the features with the highest global Shapley values.
ICE and partial dependence plots indicate that the model behaves strange at percentile 0 and 99. Take a look at those samples.
# Force plots for the extreme predictions (0th / 99th percentile rows) that
# the ICE curves flagged as behaving strangely.
for i in [0, 99]:
    print('Displaying local shapley explaination for observation percentile {}'.format(i))
    index = validation_data[validation_data.id == xgb_per_dict[i]].index.values[0]
    print('Model output: {}'.format(float(validation_data.iloc[index, -1])))
    display(shap.force_plot(explain.expected_value, shap_value[index,:], data_to_use.iloc[index, :]))
For the 0 percentile sample, we see that delt latitude and delt longitude are very small thus having a huge negative impact on the trip duration. However, we see that the trip has 5 right turns thus the trip still has some valid distance. In the future we might need to consider dropping the delt latitude(longitude) or add more training data that contain trips like this one.
For the 99 percentile sample, we see that cp_distance_dropoff and delt_latitude contributed hugely on the long trip duration. Same as the above sample, we might need to consider adding more training samples or removing delt longitude(latitude).
# Score the prepared test (submission) set with the trained model.
test_sub = pd.read_csv('C:\\Users\\lllll\\Desktop\\Responsible ml\\sub_modified.csv')
le = LabelEncoder()
# NOTE(review): fitting a fresh LabelEncoder on the test data can assign
# integer codes inconsistent with the training-time encoding — verify.
for i in ['pickup_neighborhood_name','dropoff_neighborhood_name']:
    temp = test_sub[i]
    test_sub['cat_{}'.format(i)] = le.fit_transform(temp)
new_xgb = gpu_xgb_model.predict(test_sub[train_feature_list])
test_sub['trip_duration'] = new_xgb
def convert_0_trip(distance, time, lat, long, pre):
    """Zero the prediction *pre* when the route features say the cab never
    moved (no distance, no travel time, negligible coordinate deltas)."""
    is_stationary = (
        distance == 0 and time == 0
        and lat <= 0.00001 and long <= 0.00001
    )
    return 0 if is_stationary else pre
# Zero out predictions for rows whose route features say the cab never moved.
test_sub['trip_duration_1'] = test_sub.apply(lambda x: convert_0_trip(x['total_distance'], x['total_travel_time'], x['delt_latitude'], x['delt_longitude'], x['trip_duration']), axis = 1)
def convert_negative_trip(pre):
    """Clamp a negative predicted duration to zero; pass positives through."""
    return pre if pre >= 0 else 0
# Bug fix: the original clamped the raw 'trip_duration' column, silently
# overwriting (and discarding) the zero-trip correction computed just above;
# clamp negatives on the already-corrected column instead.
test_sub['trip_duration_1'] = test_sub.apply(lambda x: convert_negative_trip(x['trip_duration_1']), axis = 1)
def create_sub_file(sub_data, beta, model, feature):
    """Write the Kaggle submission csv and append the model's parameters and
    feature list to a running log file.

    sub_data: frame holding 'id' and the cleaned prediction 'trip_duration_1';
    beta: version tag used in the output filename; model: fitted estimator;
    feature: list of training feature names.
    """
    sub_file = sub_data[['id','trip_duration_1']]
    sub_file = sub_file.rename(columns = {'id':'id','trip_duration_1':'trip_duration'})
    sub_file.to_csv('C:\\Users\\lllll\\Desktop\\Responsible ml\\new_xgb_beta{}.csv'.format(beta), index = None)
    with open('C:\\Users\\lllll\\Desktop\\Responsible ml\\xgb_record.txt','a') as xgb_param:
        # Bug fix: get_params is a method — without the call the log recorded
        # "<bound method ...>" instead of the parameter dict.  The explicit
        # close() was dropped: the `with` block already closes the file.
        context = str(str(model.get_params())+'\n'+str(feature)+'\n\n')
        xgb_param.write(context)
create_sub_file(test_sub, 1, gpu_xgb_model, train_feature_list)
import pickle
import xgboost
pickle.dump(gpu_xgb_model, open('C:\\Users\\lllll\\Desktop\\Responsible ml\\best_gpu_xgb.pkl', "wb"))
gpu_xgb_model = pickle.load(open('C:\\Users\\lllll\\Desktop\\Responsible ml\\best_gpu_xgb.pkl', "rb"))
gpu_xgb_model.get_params
# Mean trip_duration per delta-lat/long band (0.0005-degree bands up to
# 0.0125), presumably used to impute durations for near-stationary trips.
mean_to_fill = []
bounds = [i/10000 for i in range(0, 125, 5)]
for i in range(len(bounds)):
    if i != len(bounds) - 1:
        if i == 0:
            upper = float(bounds[i+1])
            lower = float(bounds[i])
            temp = train[(train.delt_longitude <= upper) & (train.delt_latitude <= upper)]
            # First band uses a strict lower bound so exactly-zero deltas are excluded.
            temp_mean = temp[(train.delt_longitude > lower) & (train.delt_latitude > lower)]['trip_duration'].mean()
            mean_to_fill.append(temp_mean)
        else:
            upper = float(bounds[i+1])
            lower = float(bounds[i])
            temp = train[(train.delt_longitude <= upper) & (train.delt_latitude <= upper)]
            # NOTE(review): the boolean masks are built from `train` but
            # applied to the subset `temp`; pandas aligns on index, but
            # mask-from-parent filtering is fragile — consider using
            # temp.delt_longitude / temp.delt_latitude instead.
            temp_mean = temp[(train.delt_longitude >= lower) & (train.delt_latitude >= lower)]['trip_duration'].mean()
            mean_to_fill.append(temp_mean)
    else:
        # Last band: everything from the final bound up to the 0.0125 cap.
        lower = float(bounds[i])
        temp = temp = train[(train.delt_longitude <= 0.0125) & (train.delt_latitude <= 0.0125)]
        temp_mean = temp[(train.delt_longitude >= lower) & (train.delt_latitude >= lower)]['trip_duration'].mean()
        mean_to_fill.append(temp_mean)
Sample submission: Preprocess submission data.
# Rebuild the engineered features on the submission set: coordinate deltas,
# geodesic distance, airport distances, and hour of day.
weather = pd.read_csv('C:\\Users\\lllll\\Desktop\\Responsible ml\\modified_weather.csv')
weather = weather.drop(['Unnamed: 0'], axis = 1)
sub = pd.read_csv('C:\\Users\\lllll\\Desktop\\Responsible ml\\sub_modified.csv')
sub
sub['delt_longitude'] = (sub['pickup_longitude'] - sub['dropoff_longitude']).abs()
sub['delt_latitude'] = (sub['pickup_latitude'] - sub['dropoff_latitude']).abs()
# Geodesic trip distance in miles between pickup and dropoff coordinates.
geo_dist = lambda x: geopy.distance.distance((x[0], x[1]), (x[2], x[3])).miles
sub_dist = sub[['pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude']].values
try_dist = [geo_dist(n) for n in sub_dist]
sub['distance'] = try_dist
# Airport coordinates (lat, lon): JFK, LaGuardia, Newark.
jfk = [40.6413111, -73.7781391]
lga = [40.7730135746, -73.8702298524]
ewr = [40.6895314, -74.17446239999998]
jfk_dist = lambda x: geopy.distance.distance((x[0], x[1]), (jfk[0], jfk[1])).miles
jfk_miles_pickup, jfk_miles_dropoff = [jfk_dist(n) for n in sub[['pickup_latitude','pickup_longitude']].values], [jfk_dist(n) for n in sub[['dropoff_latitude','dropoff_longitude']].values]
lga_dist = lambda x: geopy.distance.distance((x[0], x[1]), (lga[0], lga[1])).miles
lga_miles_pickup, lga_miles_dropoff = [lga_dist(n) for n in sub[['pickup_latitude','pickup_longitude']].values], [lga_dist(n) for n in sub[['dropoff_latitude','dropoff_longitude']].values]
ewr_dist = lambda x: geopy.distance.distance((x[0], x[1]), (ewr[0], ewr[1])).miles
ewr_miles_pickup, ewr_miles_dropoff = [ewr_dist(n) for n in sub[['pickup_latitude','pickup_longitude']].values], [ewr_dist(n) for n in sub[['dropoff_latitude','dropoff_longitude']].values]
sub['jfk_miles_pickup'] = jfk_miles_pickup
sub['jfk_miles_dropoff'] = jfk_miles_dropoff
sub['lga_miles_pickup'] = lga_miles_pickup
sub['lga_miles_dropoff'] = lga_miles_dropoff
sub['ewr_miles_pickup'] = ewr_miles_pickup
sub['ewr_miles_dropoff'] = ewr_miles_dropoff
# Hour of day sliced from the 'YYYY-MM-DD HH:MM:SS' string — note this keeps
# it as a zero-padded STRING, not an int.
sub_hour_values = np.array(sub['pickup_datetime'])
sub_filters = lambda x: x[11:13]
sub_hour_values_final = np.array([sub_filters(n) for n in sub_hour_values])
sub['hour'] = sub_hour_values_final
def get_sub_day(data):
    """Attach a weekday column (0=Monday .. 6=Sunday) to the module-level `sub`.

    data: sequence of 'YYYY-MM-DD ...' strings; only the date prefix is
    parsed.  NOTE(review): writes to the global `sub` rather than a parameter.
    """
    weekdays = [datetime.datetime.strptime(value[0:10], '%Y-%m-%d').weekday()
                for value in data]
    sub['day'] = np.array(weekdays)
    return sub
if __name__ == '__main__':
    get_sub_day(sub['pickup_datetime'])
# Date string, duplicated into 'Date' to match the weather table's join key.
date_filter = lambda x: x[0:10]
sub['date'] = np.array([date_filter(n) for n in sub['pickup_datetime']])
sub['Date'] = sub['date']
sub = sub.drop(['date'], axis = 1)
weather
sub = sub.merge(weather, on = 'Date', how = 'left')
# Zero-padded month string sliced from the datetime.
month = lambda x: x[5:7]
month_array = np.array([month(n) for n in np.array(sub['pickup_datetime'])])
month_array
sub['month'] = month_array
def sub_airport_trip():
    """Flag trips starting or ending within 1.5 miles of JFK/LGA/EWR on the
    module-level `sub` (one 0/1 column per airport)."""
    for code in ['jfk','lga','ewr']:
        pair_values = np.array(sub[['{}_miles_pickup'.format(code), '{}_miles_dropoff'.format(code)]].values)
        flags = [1 if (pickup <= 1.5) or (dropoff <= 1.5) else 0
                 for pickup, dropoff in pair_values]
        sub['{}_trip'.format(code)] = flags
if __name__ == '__main__':
    sub_airport_trip()
# Borough labels from coordinates via the `bound` helper defined earlier in
# the notebook (not visible in this chunk).
sub['Pickup_borough'] = list(map(bound, sub[['pickup_latitude','pickup_longitude']].values))
sub['Dropoff_borough'] = list(map(bound, sub[['dropoff_latitude','dropoff_longitude']].values))
def central_park_distance(data):
    """Geodesic distance in miles from the (lat, lon) pair *data* to
    Central Park."""
    lat, lon = list(data)[0], list(data)[1]
    return geopy.distance.distance((lat, lon), (40.785091, -73.968285)).miles
# Distance of each trip endpoint from Central Park, then an eyeball check of
# the column order against the training table.
sub['cp_distance_pickup'] = list(map(central_park_distance, sub[['pickup_latitude', 'pickup_longitude']].values))
sub['cp_distance_dropoff'] = list(map(central_park_distance, sub[['dropoff_latitude', 'dropoff_longitude']].values))
sub.columns.values
cols_right_order = train2.columns.values
cols_right_order
# Bug fix: drop() defaults to axis=0 (row labels), so dropping the 'Date'
# COLUMN requires axis=1; the original raised KeyError on the row index.
sub = sub.drop(['Date'], axis = 1)
sub = pd.read_csv('C:\\Users\\lllll\\Desktop\\Responsible ml\\sub_modified.csv')
knyc = pd.read_csv('C:\\Users\\lllll\\Desktop\\Responsible ml\\KNYC_Metars.csv')
date_filter = lambda x: x[0:10]
knyc_date = [date_filter(n) for n in knyc['Time'].values]
knyc['Date'] = knyc_date
year_filter = lambda x: x[0:4]
knyc_year = [year_filter(n) for n in knyc['Time'].values]
knyc['Year'] = knyc_year
knyc = knyc[knyc.Year == '2016']
time_filter = lambda x: str(x[0:13])+':00:00'
sub_time = [time_filter(n) for n in sub['pickup_datetime'].values]
#sub_dropofftime = [time_filter(n) for n in sub['dropoff_datetime'].values]
sub['pickup_time'] = sub_time
#sub['dropoff_time'] = sub_dropofftime
sub = sub.merge(knyc, left_on = 'pickup_time', right_on = 'Time', how = 'left')
sub = sub.drop(['pickup_time'], axis = 1)
try1 = pd.read_csv('C:\\Users\\lllll\\Desktop\\Responsible ml\\combined+table+with+neighborhood+and+boro+information.csv')
try1 = try1[['id','pickup_boro','pickup_neighborhood_name','pickup_neighborhood_code','dropoff_neighborhood_name','dropoff_neighborhood_code']]
sub = sub.merge(try1, on = 'id', how = 'left')
# Dates of 2016 NYC extreme-weather events, then a first pass at flagging
# them, plus neighborhood lookups and categorical encoding.
weather_event = ['20160110', '20160113', '20160117', '20160123',
                 '20160205', '20160208', '20160215', '20160216',
                 '20160224', '20160225', '20160314', '20160315',
                 '20160328', '20160329', '20160403', '20160404',
                 '20160530', '20160628']
# NOTE(review): this converts to datetime.date objects, while the joined date
# columns appear to be strings — isin() likely matches nothing; the
# string-based extreme_weather() further below is the version that works.
weather_event = pd.Series(pd.to_datetime(weather_event, format = '%Y%m%d')).dt.date
sub['extreme_weather'] = sub.Date_x.isin(weather_event).map({True: 1, False: 0})
sub
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Integer-encode the categorical weather/neighborhood columns (cast to str so
# NaN becomes the literal 'nan' category rather than failing the fit).
cat_cols = ['Events','Conditions','pickup_neighborhood_code','dropoff_neighborhood_code']
for i in cat_cols:
    print(i)
    sub[i] = sub[i].astype(str)
    sub['cat_{}'.format(i)] = le.fit_transform(sub[i])
sub
def create_date(data):
    """Return the 'YYYY-MM-DD' prefix of a datetime string."""
    return data[0:10]
# Rebuild 'Date' and retry the extreme-weather flag.
sub['Date'] = list(map(create_date, sub['pickup_datetime']))
weather_event = ['20160110', '20160113', '20160117', '20160123',
                 '20160205', '20160208', '20160215', '20160216',
                 '20160224', '20160225', '20160314', '20160315',
                 '20160328', '20160329', '20160403', '20160404',
                 '20160530', '20160628']
# NOTE(review): still compares datetime.date objects against the string
# 'Date' column — superseded by the extreme_weather() helper below.
weather_event = pd.Series(pd.to_datetime(weather_event, format = '%Y%m%d')).dt.date
sub['extreme_weather'] = sub.Date.isin(weather_event).map({True: 1, False: 0})
def extreme_weather(data):
    """Return 1 when *data* (a 'YYYY-MM-DD' string) is one of the 2016 NYC
    extreme-weather dates, else 0."""
    weather_event = ['2016-01-10', '2016-01-13', '2016-01-17', '2016-01-23',
                     '2016-02-05', '2016-02-08', '2016-02-15', '2016-02-16',
                     '2016-02-24', '2016-02-25', '2016-03-14', '2016-03-15',
                     '2016-03-28', '2016-03-29', '2016-04-03', '2016-04-04',
                     '2016-05-30', '2016-06-28']
    return 1 if data in weather_event else 0
# Final, working extreme-weather flag (string-to-string comparison).
sub['extreme_weather'] = list(map(extreme_weather, sub['Date']))
# Weather columns whose names contain spaces, to be renamed with underscores;
# modified_dic collects the old->new name mapping as a side effect.
modify_list = ['maximum temperature','minimum temperature','average temperature','snow fall','snow depth']
modified_dic = {}
def modified_weather(data):
    """Rename the weather columns listed in the module-level `modify_list`,
    replacing the space with an underscore (e.g. 'snow fall' -> 'snow_fall').

    Side effect: records every old->new mapping (identity for untouched
    columns) in the module-level `modified_dic`.  Returns the renamed frame.
    """
    for col in data.columns.values:
        if col in modify_list:
            words = str(col).split(' ')
            modified_dic[col] = words[0] + '_' + words[1]
        else:
            modified_dic[col] = col
    return data.rename(columns = modified_dic)
# Apply the weather-column renaming to the submission frame.
sub = modified_weather(sub)
def going_to_manhattan(data):
    """Label a (pickup_borough, dropoff_borough) pair by its relationship to
    Manhattan (e.g. 'To Manhattan', 'Out Manhattan', 'Inside Manhattan')."""
    pickup, dropoff = list(data)[0], list(data)[1]
    if dropoff == 'Manhattan':
        return 'Inside Manhattan' if pickup == 'Manhattan' else 'To Manhattan'
    if pickup == 'Manhattan':
        return 'Out Manhattan'
    if (pickup == 'Outside NYC') & (dropoff != 'Outside NYC'):
        return 'Goin NYC not Manhattan'
    return 'Not Manhattan'
# Categorise each trip by its relationship to Manhattan.
sub['from_to'] = list(map(going_to_manhattan, sub[['Pickup_borough','Dropoff_borough']].values))
def get_holidays(data):
    """Return the US holiday name for the date *data*, or None if it is not
    a holiday.

    Performance fix: the original rebuilt holidays.UnitedStates() on every
    call, which is very slow when mapped over the full dataframe; the
    calendar is now built once and cached on the function object.
    """
    us_holidays = getattr(get_holidays, '_us_holidays', None)
    if us_holidays is None:
        us_holidays = holidays.UnitedStates()
        get_holidays._us_holidays = us_holidays
    return us_holidays.get(data)
# Holiday name (or None) per trip date.
# NOTE(review): the lowercase 'date' column was dropped earlier in this
# pipeline — confirm whether this should read 'Date' instead.
sub['holiday'] = list(map(get_holidays, sub['date'].values))
def get_rushhour(data):
    """Bucket an hour of day into 'morning_rush' (6-9), 'eve_rush' (15-20)
    or 'not_rush'.

    Bug fix: upstream, 'hour' is sliced out of the datetime string and may
    arrive as a zero-padded string such as '07'; the original
    `data in range(...)` test never matches a string, silently labelling
    every row 'not_rush'.  Coerce to int first (a no-op for int input).
    """
    hour = int(data)
    if 6 <= hour < 10:
        return 'morning_rush'
    if 15 <= hour < 21:
        return 'eve_rush'
    return 'not_rush'
# NOTE(review): 'hour' holds zero-padded strings at this point — ensure
# get_rushhour handles string input.
sub['rush_hour'] = list(map(get_rushhour, sub['hour'].values))
def create_onehot(categorical, data):
    """One-hot encode each column named in *categorical* and join the dummy
    columns onto *data*.

    Dummy columns are prefixed with the source column name; for the two
    borough columns the '_borough' suffix is stripped first (giving e.g.
    'Pickup_Manhattan').  Returns a new frame; *data* itself is not mutated.
    """
    for cat in categorical:
        dummies = pd.get_dummies(data[cat])
        prefix = cat[0:-8] if cat in ('Pickup_borough', 'Dropoff_borough') else cat
        renamed = {level: '{}_{}'.format(prefix, level) for level in dummies.columns.values}
        dummies = dummies.rename(columns = renamed)
        data = data.merge(dummies, how = 'left', left_index = True, right_index = True)
    return data
# One-hot encode all categorical features on the submission frame.
categorical_list = ['vendor_id','store_and_fwd_flag','day','month','Pickup_borough','Dropoff_borough','from_to','holiday','rush_hour']
sub = create_onehot(categorical_list, sub)
#for i in sub.columns.values:
#if i[0:3] == 'cat':
#sub = sub.drop([i], axis = 1)
# Missing wind-chill is treated as "no wind chill" (0); all remaining NaNs
# become the string 'None' and are cleaned up numerically further below.
sub['Windchill'] = sub['Windchill'].fillna(0)
sub = sub.fillna('None')
def _count_steps(data, *tokens):
    """Count how many '|'-separated entries of *data* exactly equal any of
    *tokens* (shared helper for the maneuver/direction counters below)."""
    steps = data.split('|')
    return sum(steps.count(token) for token in tokens)

def get_left_turns(data):
    """Count left turns (left / slight left / sharp left) in a step string."""
    return _count_steps(data, 'left', 'slight left', 'sharp left')

def get_right_turns(data):
    """Count right turns (right / slight right / sharp right)."""
    return _count_steps(data, 'right', 'slight right', 'sharp right')

def get_straight_turns(data):
    """Count 'straight' steps."""
    return _count_steps(data, 'straight')

def get_onramp(data):
    """Count 'on ramp' maneuvers."""
    return _count_steps(data, 'on ramp')

def get_offramp(data):
    """Count 'off ramp' maneuvers."""
    return _count_steps(data, 'off ramp')

def get_fork(data):
    """Count 'fork' maneuvers."""
    return _count_steps(data, 'fork')

def get_endofroad(data):
    """Count 'end of road' maneuvers."""
    return _count_steps(data, 'end of road')

def get_continue(data):
    """Count 'continue' maneuvers."""
    return _count_steps(data, 'continue')

def get_roundabout(data):
    """Count 'roundabout' maneuvers."""
    return _count_steps(data, 'roundabout')

def get_rotary(data):
    """Count 'rotary' maneuvers."""
    return _count_steps(data, 'rotary')

def get_roundaboutturn(data):
    """Count 'roundabout turn' maneuvers."""
    return _count_steps(data, 'roundabout turn')
# OSRM fastest-route features for the test set: distance converted from
# meters to miles, plus counts of each maneuver/direction type.
route_api_sub = pd.read_csv('C:\\Users\\lllll\\Desktop\\Responsible ml\\fastest_routes_test.csv')
route_api_sub['total_distance'] = route_api_sub['total_distance'] / 1000 / 1.6
route_api_sub['on_ramps'] = list(map(get_onramp, route_api_sub['step_maneuvers']))
route_api_sub['off_ramps'] = list(map(get_offramp, route_api_sub['step_maneuvers']))
route_api_sub['forks'] = list(map(get_fork, route_api_sub['step_maneuvers']))
route_api_sub['end_roads'] = list(map(get_endofroad, route_api_sub['step_maneuvers']))
route_api_sub['continue'] = list(map(get_continue, route_api_sub['step_maneuvers']))
route_api_sub['roundabout'] = list(map(get_roundabout, route_api_sub['step_maneuvers']))
route_api_sub['rotary'] = list(map(get_rotary, route_api_sub['step_maneuvers']))
route_api_sub['roundaboutturn'] = list(map(get_roundaboutturn, route_api_sub['step_maneuvers']))
route_api_sub['left_turns'] = list(map(get_left_turns, route_api_sub['step_direction']))
route_api_sub['right_turns'] = list(map(get_right_turns, route_api_sub['step_direction']))
route_api_sub['straight_turns'] = list(map(get_straight_turns, route_api_sub['step_direction']))
# Replace 'None' placeholders in numeric feature columns with 0, merge in the
# route features, and checkpoint the finished submission frame.
# NOTE(review): `feature_list` is defined elsewhere in the notebook — confirm
# it matches train_feature_list.
none_list = []
for i in feature_list:
    if 'None' in sub[i].values:
        print(i)
        none_list.append(i)
for i in none_list:
    sub[i] = sub[i].replace('None', 0)
sub = sub.merge(route_api_sub[['id','total_distance','total_travel_time','number_of_steps','left_turns',
                               'right_turns','straight_turns','on_ramps','off_ramps','forks','end_roads','continue'
                               ,'roundabout','rotary','roundaboutturn']], on = 'id', how = 'left')
sub.to_csv('C:\\Users\\lllll\\Desktop\\Responsible ml\\sub_modified.csv', index = None)
del sub